Introduction

In this homework I cluster the deputies of the two biggest parties according to their voting records, using two different distance measures and two different linkage methods, and then visualize the resulting clusters.

Data Loading

library(dplyr)
library(tidyr)
library(cluster)
library(ape)
library(RColorBrewer)

# Load the saved R objects from the course data file. The code below expects
# this to provide the `all_votes` data set (TODO confirm the object name
# stored in the file).
# NOTE(review): this is an absolute, machine-specific path, so the script only
# runs where this exact file location exists.
load("D:/MSc Computer science and informatics/3rd semester/Data Mining - Advances/Homeworks/all_votes.rda")

Data Preparation

First, I filter the data to keep only the records of the two biggest parties, PO and PiS.
Then I count the distinct deputies, so that I can keep only the votings in which more than 75% of those deputies cast a vote.
To count the votes I group the records by id_voting; filtering on that count gives the final dataset used for clustering.

# Keep only the records of the two biggest clubs (first seven columns).
mydata <- all_votes[all_votes$club %in% c("PO", "PiS"), 1:7]

# A deputy is identified by name AND club, because the same name may occur in
# different clubs.
unique_deputies <- unique(mydata[c("surname_name", "club")])

# Number of recorded votes per voting.
voting_count <- mydata %>%
  group_by(id_voting) %>%
  summarise(vote_count = n())

# Keep only the votings in which more than 75% of the deputies took part.
# nrow() is required here: length() of a data frame is its number of COLUMNS
# (2 in this case), which would turn the threshold into 1.5 votes instead of
# 75% of the deputies.
min_votes <- 0.75 * nrow(unique_deputies)
filtered_vote <- voting_count[voting_count$vote_count > min_votes, ]

# Restrict the data set to the retained votings.
mydata <- mydata[mydata$id_voting %in% filtered_vote$id_voting, ]

# Vote coding: For = 1, Against = -1, anything else (abstained, absent,
# missing value) = 0. The neutral value must be 0 because spread() below fills
# votings without a record with 0; coding "For" as 0 would make a missing vote
# indistinguishable from a vote in favour.
all_votes_new <- mydata %>%
  mutate(
    voting_type = case_when(
      vote == "For" ~ 1,
      vote == "Against" ~ -1,
      TRUE ~ 0
    )
  )

all_votes_new <- all_votes_new[
  , c("surname_name", "club", "id_voting", "voting_type")
]

# One row per deputy, one column per voting; votings without a record for a
# deputy are filled with the neutral value 0.
final_data <- all_votes_new %>%
  spread(id_voting, voting_type, fill = 0)

Distance Matrix Calculation

I will use two different distance measures, treating the id_voting columns as the coordinates of each deputy:
1- Euclidean distance
2- Manhattan distance

# Label every deputy (row) as "club_name". The data is converted to a plain
# data frame first because row names are only reliably supported there
# (final_data may be a tibble, depending on the class of the loaded data).
final_data <- as.data.frame(final_data)
rownames(final_data) <- paste(final_data$club, final_data$surname_name, sep = "_")

# Voting columns are selected BY NAME and limited to those that actually exist
# in final_data. Indexing with the raw id_voting values of the unfiltered
# all_votes is unsafe: numeric ids are interpreted as column positions, and
# ids of votings that were filtered out have no column at all.
distinct_id_voting <- intersect(
  as.character(unique(all_votes$id_voting)),
  colnames(final_data)
)

# Numeric vote coordinates only; drop = FALSE keeps a data frame even when a
# single voting remains.
vote_coords <- final_data[, distinct_id_voting, drop = FALSE]

# Euclidean distances between deputies
mat1 <- dist(vote_coords)
final_mat <- as.matrix(mat1)

# Manhattan distances between deputies
mat2 <- dist(vote_coords, method = "manhattan")
final_mat2 <- as.matrix(mat2)

Clustering

I will cluster the data using two different linkage methods:
1- average linkage, applied to the distance matrix computed with the Euclidean distance
2- complete linkage, applied to the distance matrix computed with the Manhattan distance

# Colour index per deputy: 1 for PO, 2 for PiS (used later to pick tip colours).
final_data$colr <- factor(ifelse(final_data$club == "PO", 1, 2))
# Separate copy so each clustering stores its own cluster labels.
final_data2 <- final_data

# Average linkage on the Euclidean distances.
# agnes() only treats its input as dissimilarities when it is a "dist" object
# (diss = inherits(x, "dist")); a plain matrix would be taken as raw
# observations and re-measured with Euclidean distance, so the matrix is
# converted back with as.dist().
hc <- agnes(as.dist(final_mat), method = "average")
plot(hc, which.plots = 2, cex = 0.5, main = "")

# Cut the tree into 4 clusters.
final_data$labels <- factor(cutree(as.hclust(hc), k = 4))

# Complete linkage on the Manhattan distances (same as.dist() conversion).
hc2 <- agnes(as.dist(final_mat2), method = "complete")
plot(hc2, which.plots = 2, cex = 0.5, main = "")

final_data2$labels <- factor(cutree(as.hclust(hc2), k = 4))

Clusters Visualization

I have clustered all the deputies and used their names and club as the row name.
Red Color for PO party
Blue Color for PiS party

Clusters Using Euclidean Distance And Average Linkage

cols <- brewer.pal(3, "Set1")

# Tree for this section: average linkage on the Euclidean distance matrix.
# The clustering must be built from the distances, not from the final_data
# data frame itself, which also holds the name, club, colour and label columns.
hc <- as.phylo(as.hclust(agnes(as.dist(final_mat), method = "average")))

# Tip colours follow the row order of final_data; the factor colr indexes the
# palette by its integer codes (1 = PO, 2 = PiS).
tip_cols <- cols[final_data$colr]

# Small margins and no clipping so long tip labels stay visible; the previous
# graphical settings are restored after plotting.
old_par <- par(mar = c(1, 1, 2, 1), xpd = NA)

# Draw the same tree in every layout.
for (tree_type in c("fan", "unrooted", "radial", "phylogram", "cladogram")) {
  plot(hc, type = tree_type, cex = 0.8, tip.color = tip_cols)
}

par(old_par)

Clusters Using Manhattan Distance And Complete Linkage

cols <- brewer.pal(3, "Set1")

# Tree for this section: complete linkage on the Manhattan distance matrix.
# The clustering must be built from the distances, not from the final_data2
# data frame itself, which also holds the name, club, colour and label columns.
hc2 <- as.phylo(as.hclust(agnes(as.dist(final_mat2), method = "complete")))

# Tip colours follow the row order of final_data2; the factor colr indexes the
# palette by its integer codes (1 = PO, 2 = PiS).
tip_cols2 <- cols[final_data2$colr]

# Small margins and no clipping so long tip labels stay visible; the previous
# graphical settings are restored after plotting.
old_par2 <- par(mar = c(1, 1, 2, 1), xpd = NA)

# Draw the same tree in every layout.
for (tree_type in c("fan", "unrooted", "radial", "phylogram", "cladogram")) {
  plot(hc2, type = tree_type, cex = 0.8, tip.color = tip_cols2)
}

par(old_par2)

Remarks

Because of the large number of deputies, the visualizations are not very clear; filtering the deputies further would give clearer results.